# 爬取豆瓣 Top250 数据

import requests
import re
import csv

# Named groups, written (?P<group_name>pattern), let individual fields be pulled
# out of each overall match by name. re.S makes '.' match newlines as well,
# because a single list entry spans many lines of HTML.
MOVIE_PATTERN = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<title>.*?)</span>'
                           r'.*?<p class="">.*?<br>(?P<year>.*?)&nbsp;/&nbsp;'
                           r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
                           r'.*?<span>(?P<pNum>.*?)</span>'
                           , re.S)


def parse_movies(page_content):
    """Extract one row per movie entry found in the list page HTML.

    Args:
        page_content: HTML text of the list page.

    Returns:
        A list of ``[title, year, score, pNum]`` rows (all strings), in page
        order. ``year`` has surrounding whitespace stripped. The list is
        empty when nothing in ``page_content`` matches.
    """
    rows = []
    for match in MOVIE_PATTERN.finditer(page_content):
        # Build the row from named groups explicitly so the CSV column order
        # is fixed by this code rather than by dict iteration order.
        rows.append([
            match.group("title"),
            match.group("year").strip(),
            match.group("score"),
            match.group("pNum"),
        ])
    return rows


if __name__ == "__main__":

    # Only this single URL is requested; no paging parameters are sent.
    url = 'https://movie.douban.com/top250'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    try:
        # Fail loudly on an HTTP error instead of parsing an error page.
        resp.raise_for_status()
        page_content = resp.text
    finally:
        # Release the connection even if the status check or decoding fails.
        resp.close()

    print(page_content)

    # newline='' is what the csv module expects for its output files; without
    # it, extra blank rows appear on platforms that translate '\n' on write.
    # The with-block guarantees the file is flushed and closed.
    with open('../爬虫文件/08.爬取豆瓣Top250-data.csv', mode='w', encoding='utf-8', newline='') as csv_file:
        csvwriter = csv.writer(csv_file)
        for row in parse_movies(page_content):
            print('  '.join(row))
            csvwriter.writerow(row)

    print('over!')



